import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
hosp_det = pd.read_csv(r"E:\DATA SCIENCE\CapStone\Project1\Hospitalisation details.csv")
med_exams = pd.read_csv(r"E:\DATA SCIENCE\CapStone\Project1\Medical Examinations.csv")
name_data = pd.read_excel(r"E:\DATA SCIENCE\CapStone\Project1\Names.xlsx")
hosp_det
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2338 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | tier - 1 | tier - 2 | R1012 |
| 2339 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | tier - 1 | tier - 3 | R1024 |
| 2340 | Id3 | 1970 | ? | 11 | 3 | 60021.40 | tier - 1 | tier - 1 | R1012 |
| 2341 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | tier - 2 | tier - 3 | R1013 |
| 2342 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | tier - 1 | tier - 3 | R1013 |
2343 rows × 9 columns
med_exams
| Customer ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | |
|---|---|---|---|---|---|---|---|---|
| 0 | Id1 | 47.410 | 7.47 | No | No | No | No major surgery | yes |
| 1 | Id2 | 30.360 | 5.77 | No | No | No | No major surgery | yes |
| 2 | Id3 | 34.485 | 11.87 | yes | No | No | 2 | yes |
| 3 | Id4 | 38.095 | 6.05 | No | No | No | No major surgery | yes |
| 4 | Id5 | 35.530 | 5.45 | No | No | No | No major surgery | yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2330 | Id2331 | 22.340 | 5.57 | No | No | No | 1 | No |
| 2331 | Id2332 | 17.700 | 6.28 | No | No | No | 1 | No |
| 2332 | Id2333 | 16.470 | 6.35 | No | No | Yes | 1 | No |
| 2333 | Id2334 | 17.600 | 4.39 | No | No | No | 1 | No |
| 2334 | Id2335 | 17.580 | 4.51 | No | No | No | 1 | No |
2335 rows × 8 columns
name_data
| Customer ID | name | |
|---|---|---|
| 0 | Id1 | Hawks, Ms. Kelly |
| 1 | Id2 | Lehner, Mr. Matthew D |
| 2 | Id3 | Lu, Mr. Phil |
| 3 | Id4 | Osborne, Ms. Kelsey |
| 4 | Id5 | Kadala, Ms. Kristyn |
| ... | ... | ... |
| 2330 | Id2331 | Brietzke, Mr. Jordan |
| 2331 | Id2332 | Riveros Gonzalez, Mr. Juan D. Sr. |
| 2332 | Id2333 | Albano, Ms. Julie |
| 2333 | Id2334 | Rosendahl, Mr. Evan P |
| 2334 | Id2335 | German, Mr. Aaron K |
2335 rows × 2 columns
hosp_det.describe()
| date | children | charges | |
|---|---|---|---|
| count | 2343.000000 | 2343.000000 | 2343.000000 |
| mean | 15.554844 | 1.026035 | 13559.067870 |
| std | 8.721194 | 1.233847 | 11922.658415 |
| min | 1.000000 | 0.000000 | 563.840000 |
| 25% | 8.000000 | 0.000000 | 5084.010000 |
| 50% | 15.000000 | 0.000000 | 9634.540000 |
| 75% | 23.000000 | 2.000000 | 17029.675000 |
| max | 30.000000 | 5.000000 | 63770.430000 |
med_exams.describe()
| BMI | HBA1C | |
|---|---|---|
| count | 2335.000000 | 2335.000000 |
| mean | 30.972649 | 6.578998 |
| std | 8.742095 | 2.228731 |
| min | 15.010000 | 4.000000 |
| 25% | 24.600000 | 4.900000 |
| 50% | 30.400000 | 5.810000 |
| 75% | 36.300000 | 7.955000 |
| max | 55.050000 | 12.000000 |
name_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 name 2335 non-null object dtypes: object(2) memory usage: 36.6+ KB
hosp_det.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2343 entries, 0 to 2342 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2343 non-null object 1 year 2343 non-null object 2 month 2343 non-null object 3 date 2343 non-null int64 4 children 2343 non-null int64 5 charges 2343 non-null float64 6 Hospital tier 2343 non-null object 7 City tier 2343 non-null object 8 State ID 2343 non-null object dtypes: float64(1), int64(2), object(6) memory usage: 164.9+ KB
med_exams.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 BMI 2335 non-null float64 2 HBA1C 2335 non-null float64 3 Heart Issues 2335 non-null object 4 Any Transplants 2335 non-null object 5 Cancer history 2335 non-null object 6 NumberOfMajorSurgeries 2335 non-null object 7 smoker 2335 non-null object dtypes: float64(2), object(6) memory usage: 146.1+ KB
Collate the files so that all the information is in one place¶
dataset = pd.merge(hosp_det, med_exams, on="Customer ID")
dataset = pd.merge(dataset, name_data, on="Customer ID")
dataset.head()
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.58 | 4.51 | No | No | No | 1 | No | German, Mr. Aaron K |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.60 | 4.39 | No | No | No | 1 | No | Rosendahl, Mr. Evan P |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.47 | 6.35 | No | No | Yes | 1 | No | Albano, Ms. Julie |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.70 | 6.28 | No | No | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.34 | 5.57 | No | No | No | 1 | No | Brietzke, Mr. Jordan |
dataset.isnull().sum().sum()
0
dataset.shape
(2335, 17)
Percentage of rows that have trivial value, and delete such rows if they do not contain significant information¶
trivial_rows = dataset.isin(['?']).any(axis=1)
num_trivial_rows = trivial_rows.sum()
total_rows = dataset.shape[0]
percentage_trivial = (num_trivial_rows / total_rows)*100
print(f"Percentage of rows with trivial values: {percentage_trivial*100}%")
Percentage of rows with trivial values: 42.82655246252677%
dataset = dataset[~trivial_rows]
print(dataset.shape)
(2325, 17)
Transform the nominal and ordinal categorical variables¶
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
nominal_vars = ['Heart Issues', 'Any Transplants', 'Cancer history', 'smoker']
for var in nominal_vars:
dataset[var] = le.fit_transform(dataset[var])
hospital_tier_mapping = {'tier - 1': 1, 'tier - 2': 2, 'tier - 3': 3}
city_tier_mapping = {'tier - 1': 1, 'tier - 2': 2, 'tier - 3': 3}
dataset['Hospital tier'] = dataset['Hospital tier'].map(hospital_tier_mapping)
dataset['City tier'] = dataset['City tier'].map(city_tier_mapping)
dataset
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | R1013 | 17.580 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | R1013 | 17.600 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | R1013 | 16.470 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | R1013 | 17.700 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | R1013 | 22.340 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | R1011 | 32.800 | 6.59 | 0 | 0 | 0 | No major surgery | 1 | Baker, Mr. Russell B. |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | R1012 | 35.530 | 5.45 | 0 | 0 | 0 | No major surgery | 1 | Kadala, Ms. Kristyn |
| 2331 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | 1 | 3 | R1024 | 38.095 | 6.05 | 0 | 0 | 0 | No major surgery | 1 | Osborne, Ms. Kelsey |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | R1013 | 30.360 | 5.77 | 0 | 0 | 0 | No major surgery | 1 | Lehner, Mr. Matthew D |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | R1013 | 47.410 | 7.47 | 0 | 0 | 0 | No major surgery | 1 | Hawks, Ms. Kelly |
2325 rows × 17 columns
Given the imbalanced representation of states (16) and potential insignificance of many dummy variables, focus on R1011, R1012, and R1013 for further analysis. Develop a strategy to create dummy variables for these specific regions.¶
dataset['State ID'].value_counts()
State ID R1013 609 R1011 574 R1012 572 R1024 159 R1026 84 R1021 70 R1016 64 R1025 40 R1023 38 R1017 36 R1019 26 R1022 14 R1014 13 R1015 11 R1018 9 R1020 6 Name: count, dtype: int64
dataset['group_state'] = np.where(dataset['State ID'].isin(['R1011', 'R1012', 'R1013']), dataset['State ID'], 'other')
dummie_state = pd.get_dummies(dataset['group_state'], prefix='state')
df = pd.concat([dataset,dummie_state],axis=1)
dataset
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | group_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | R1013 | 17.580 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | R1013 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | R1013 | 17.600 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | R1013 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | R1013 | 16.470 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | R1013 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | R1013 | 17.700 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | R1013 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | R1013 | 22.340 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | R1013 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | R1011 | 32.800 | 6.59 | 0 | 0 | 0 | No major surgery | 1 | Baker, Mr. Russell B. | R1011 |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | R1012 | 35.530 | 5.45 | 0 | 0 | 0 | No major surgery | 1 | Kadala, Ms. Kristyn | R1012 |
| 2331 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | 1 | 3 | R1024 | 38.095 | 6.05 | 0 | 0 | 0 | No major surgery | 1 | Osborne, Ms. Kelsey | other |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | R1013 | 30.360 | 5.77 | 0 | 0 | 0 | No major surgery | 1 | Lehner, Mr. Matthew D | R1013 |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | R1013 | 47.410 | 7.47 | 0 | 0 | 0 | No major surgery | 1 | Hawks, Ms. Kelly | R1013 |
2325 rows × 18 columns
dataset = dataset[dataset["State ID"].isin(['R1011', 'R1012', 'R1013'])]
dataset.shape # 609+574+572 = 1755
(1755, 18)
le = LabelEncoder()
dataset.loc[:, 'State ID'] = le.fit_transform(dataset['State ID']).astype(int) #explicitly casting the output of fit_transform()
dataset['State ID'].unique()
array([2, 1, 0], dtype=object)
replace_map = {'R1011': 1, 'R1012': 2, 'R1013': 3, 'other': 0}
dataset.loc[:, 'group_state'] = dataset['group_state'].replace(replace_map).astype(int)
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\1127126202.py:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
dataset.loc[:, 'group_state'] = dataset['group_state'].replace(replace_map).astype(int)
dataset
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | group_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | 2 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | 3 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | 2 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | 3 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | 2 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | 3 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | 2 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | 3 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | 2 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2328 | Id7 | 1994 | Oct | 27 | 1 | 51194.56 | 1 | 3 | 0 | 36.40 | 6.07 | 0 | 0 | 0 | No major surgery | 1 | Macpherson, Mr. Scott | 1 |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | 0 | 32.80 | 6.59 | 0 | 0 | 0 | No major surgery | 1 | Baker, Mr. Russell B. | 1 |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | 1 | 35.53 | 5.45 | 0 | 0 | 0 | No major surgery | 1 | Kadala, Ms. Kristyn | 2 |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | 2 | 30.36 | 5.77 | 0 | 0 | 0 | No major surgery | 1 | Lehner, Mr. Matthew D | 3 |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | 2 | 47.41 | 7.47 | 0 | 0 | 0 | No major surgery | 1 | Hawks, Ms. Kelly | 3 |
1755 rows × 18 columns
The 'NumberOfMajorSurgeries' variable contains string values. Apply an appropriate method to clean and convert this variable to a numerical format.¶
replace_map = {'No major surgery': 0}
dataset.loc[:, "NumberOfMajorSurgeries"] = dataset["NumberOfMajorSurgeries"].replace(replace_map)
dataset
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | group_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | 2 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | 3 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | 2 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | 3 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | 2 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | 3 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | 2 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | 3 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | 2 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2328 | Id7 | 1994 | Oct | 27 | 1 | 51194.56 | 1 | 3 | 0 | 36.40 | 6.07 | 0 | 0 | 0 | 0 | 1 | Macpherson, Mr. Scott | 1 |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | 0 | 32.80 | 6.59 | 0 | 0 | 0 | 0 | 1 | Baker, Mr. Russell B. | 1 |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | 1 | 35.53 | 5.45 | 0 | 0 | 0 | 0 | 1 | Kadala, Ms. Kristyn | 2 |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | 2 | 30.36 | 5.77 | 0 | 0 | 0 | 0 | 1 | Lehner, Mr. Matthew D | 3 |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | 2 | 47.41 | 7.47 | 0 | 0 | 0 | 0 | 1 | Hawks, Ms. Kelly | 3 |
1755 rows × 18 columns
Age is likely a significant factor in this analysis. Calculate the age of each patient based on their date of birth.¶
month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
dataset.loc[:, 'month'] = dataset['month'].map(month_dict)
dataset.loc[:, 'year'] = dataset['year'].astype(int)
current_year = 2023
dataset.loc[:, 'age'] = current_year - dataset['year']
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\3994024914.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataset.loc[:, 'age'] = current_year - dataset['year']
dataset
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | group_state | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | 7 | 9 | 0 | 563.84 | 2 | 3 | 2 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | 3 | 31 |
| 1 | Id2334 | 1992 | 11 | 30 | 0 | 570.62 | 2 | 1 | 2 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | 3 | 31 |
| 2 | Id2333 | 1993 | 6 | 30 | 0 | 600.00 | 2 | 1 | 2 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | 3 | 30 |
| 3 | Id2332 | 1992 | 9 | 13 | 0 | 604.54 | 3 | 3 | 2 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | 3 | 31 |
| 4 | Id2331 | 1998 | 7 | 27 | 0 | 637.26 | 3 | 3 | 2 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | 3 | 25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2328 | Id7 | 1994 | 10 | 27 | 1 | 51194.56 | 1 | 3 | 0 | 36.40 | 6.07 | 0 | 0 | 0 | 0 | 1 | Macpherson, Mr. Scott | 1 | 29 |
| 2329 | Id6 | 1962 | 8 | 4 | 0 | 52590.83 | 1 | 3 | 0 | 32.80 | 6.59 | 0 | 0 | 0 | 0 | 1 | Baker, Mr. Russell B. | 1 | 61 |
| 2330 | Id5 | 1989 | 6 | 19 | 0 | 55135.40 | 1 | 2 | 1 | 35.53 | 5.45 | 0 | 0 | 0 | 0 | 1 | Kadala, Ms. Kristyn | 2 | 34 |
| 2333 | Id2 | 1977 | 6 | 8 | 0 | 62592.87 | 2 | 3 | 2 | 30.36 | 5.77 | 0 | 0 | 0 | 0 | 1 | Lehner, Mr. Matthew D | 3 | 46 |
| 2334 | Id1 | 1968 | 10 | 12 | 0 | 63770.43 | 1 | 3 | 2 | 47.41 | 7.47 | 0 | 0 | 0 | 0 | 1 | Hawks, Ms. Kelly | 3 | 55 |
1755 rows × 19 columns
To potentially identify factors influencing hospitalization costs, we'll create a new field for the beneficiary's gender. This can be inferred from the salutations present in their name.¶
dataset.loc[:, 'Gender'] = dataset['name'].apply(lambda x: '0' if 'Mr.' in x else '1')
dataset.head()
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\2122272058.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataset.loc[:, 'Gender'] = dataset['name'].apply(lambda x: '0' if 'Mr.' in x else '1')
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | group_state | age | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | 7 | 9 | 0 | 563.84 | 2 | 3 | 2 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | 3 | 31 | 0 |
| 1 | Id2334 | 1992 | 11 | 30 | 0 | 570.62 | 2 | 1 | 2 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | 3 | 31 | 0 |
| 2 | Id2333 | 1993 | 6 | 30 | 0 | 600.00 | 2 | 1 | 2 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | 3 | 30 | 1 |
| 3 | Id2332 | 1992 | 9 | 13 | 0 | 604.54 | 3 | 3 | 2 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | 3 | 31 | 0 |
| 4 | Id2331 | 1998 | 7 | 27 | 0 | 637.26 | 3 | 3 | 2 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | 3 | 25 | 0 |
To visualize the distribution of costs, create a histogram, box plot, and swarm plot.¶
plt.figure(figsize=(13,6))
ax = sns.histplot(dataset["charges"], kde=False, bins=20)
for p in ax.patches:
height = p.get_height()
x = p.get_x() + p.get_width() / 2
ax.text(x, height + 2, f'{int(height)}', ha='center', va='bottom', fontsize=13)
plt.title('Distribution of costs')
plt.show()
plt.figure(figsize=(9,4))
sns.boxplot(x=dataset["charges"])
plt.title('Distribution of costs')
plt.show()
Analyze how the distribution of costs varies across different genders and hospital tiers¶
plt.figure(figsize=(15,8))
sns.swarmplot(dataset["charges"])
plt.title("Distribution of costs")
plt.show()
plt.figure(figsize=(15,8))
sns.swarmplot(x = 'charges', y = 'Gender', data = dataset)
plt.title('Swarm Plot of Charges by Gender')
plt.show()
plt.figure(figsize=(15,5))
sns.violinplot(x = "City tier", y = "charges", data = dataset)
plt.title('Violin Plot of Charges by City Tier')
plt.show()
Create a radar chart to visualize the median hospitalization cost for each hospital tier.¶
import plotly.express as ex
median = dataset.groupby('Hospital tier')[['charges']].median().reset_index()
fig = ex.line_polar(median,r='charges',theta='Hospital tier',line_close=True,title='Median Hospitalization Cost by Hospital Tier')
fig.show()
Create a frequency table and a stacked bar chart to visualize the distribution of people across different city tiers and hospital tiers.¶
Freq_table = pd.crosstab(dataset['City tier'],dataset['Hospital tier'])
Freq_table
| Hospital tier | 1 | 2 | 3 |
|---|---|---|---|
| City tier | |||
| 1 | 64 | 317 | 160 |
| 2 | 89 | 365 | 157 |
| 3 | 91 | 348 | 164 |
Freq_table.plot(kind='bar',stacked=True)
plt.xlabel('City tiers')
plt.ylabel('Hospital tiers')
plt.title('Patient Distribution by City and Hospital Tier')
for idx, tier in enumerate(Freq_table.index):
bottom = np.zeros(len(Freq_table.columns))
for col in Freq_table.columns:
count = Freq_table.loc[tier, col]
plt.text(idx, bottom[idx] + count / 2, str(count), ha='center', va='center', color='white', fontsize=10)
bottom[idx] += count
plt.show()
import scipy.stats as stats
print('Null Hypothesis : Average hospitalization costs for the three types of hospitals are not significantly different.')
print()
# ANOVA
f_val, p_val = stats.f_oneway(
dataset[dataset['Hospital tier'] == 1]['charges'],
dataset[dataset['Hospital tier'] == 2]['charges'],
dataset[dataset['Hospital tier'] == 3]['charges']
)
print('P-value:', p_val)
print()
if p_val < 0.05:
print("Reject null hypothesis")
else:
print("Accept null hypothesis")
Null Hypothesis : Average hospitalization costs for the three types of hospitals are not significantly different. P-value: 5.599708129057081e-137 Reject null hypothesis
B. The average hospitalization costs for the three types of cities are not significantly different¶
print('Null Hypothesis : The average hospitalization costs for the three types of cities are not significantly different.')
print()
# ANOVA
f_val, p_val = stats.f_oneway(
dataset[dataset['City tier'] == 1]['charges'],
dataset[dataset['City tier'] == 2]['charges'],
dataset[dataset['City tier'] == 3]['charges']
)
print('P-value:', p_val)
print()
if p_val < 0.05:
print("Reject null hypothesis")
else:
print("Accept null hypothesis")
Null Hypothesis : The average hospitalization costs for the three types of cities are not significantly different. P-value: 0.27860357802694374 Accept null hypothesis
C. The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers¶
print('Null Hypothesis : The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers.')
print()
smokers_charges = dataset[dataset['smoker'] == 1]['charges']
nonsmokers_charges = dataset[dataset['smoker'] == 0]['charges']
t_val, p_val = stats.ttest_ind(smokers_charges, nonsmokers_charges, equal_var=False)
print('P-value:', p_val)
print()
if p_val < 0.05:
print("Reject null hypothesis")
else:
print("Accept null hypothesis")
Null Hypothesis : The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers. P-value: 4.3732993882662876e-229 Reject null hypothesis
D. Smoking and heart issues are independent¶
print('Null Hypothesis : Smoking and heart issues are independent.')
print()
contingency_table = pd.crosstab(dataset['smoker'], dataset['Heart Issues'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print('Chi-square Statistic:', chi2)
print()
print('P-value:', p)
print()
if p < 0.05:
print("Reject null hypothesis")
else:
print("Accept null hypothesis")
Null Hypothesis : Smoking and heart issues are independent. Chi-square Statistic: 0.012577038637804683 P-value: 0.9107065371179246 Accept null hypothesis
Machine Learning Part¶
1. Identify and visualize highly correlated predictors using a heatmap.¶
dataset = dataset.loc[:, ~dataset.columns.isin(['Customer ID', 'name'])]
dataset
| year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | group_state | age | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1992 | 7 | 9 | 0 | 563.84 | 2 | 3 | 2 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | 3 | 31 | 0 |
| 1 | 1992 | 11 | 30 | 0 | 570.62 | 2 | 1 | 2 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | 3 | 31 | 0 |
| 2 | 1993 | 6 | 30 | 0 | 600.00 | 2 | 1 | 2 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | 3 | 30 | 1 |
| 3 | 1992 | 9 | 13 | 0 | 604.54 | 3 | 3 | 2 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | 3 | 31 | 0 |
| 4 | 1998 | 7 | 27 | 0 | 637.26 | 3 | 3 | 2 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | 3 | 25 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2328 | 1994 | 10 | 27 | 1 | 51194.56 | 1 | 3 | 0 | 36.40 | 6.07 | 0 | 0 | 0 | 0 | 1 | 1 | 29 | 0 |
| 2329 | 1962 | 8 | 4 | 0 | 52590.83 | 1 | 3 | 0 | 32.80 | 6.59 | 0 | 0 | 0 | 0 | 1 | 1 | 61 | 0 |
| 2330 | 1989 | 6 | 19 | 0 | 55135.40 | 1 | 2 | 1 | 35.53 | 5.45 | 0 | 0 | 0 | 0 | 1 | 2 | 34 | 1 |
| 2333 | 1977 | 6 | 8 | 0 | 62592.87 | 2 | 3 | 2 | 30.36 | 5.77 | 0 | 0 | 0 | 0 | 1 | 3 | 46 | 0 |
| 2334 | 1968 | 10 | 12 | 0 | 63770.43 | 1 | 3 | 2 | 47.41 | 7.47 | 0 | 0 | 0 | 0 | 1 | 3 | 55 | 1 |
1755 rows × 18 columns
plt.figure(figsize=(15, 16))
sns.heatmap(dataset.corr(), square=True, annot=True,linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show() #Here, we'll have values b/w -1 to 1
2. Develop a regression model Linear or Ridge. Evaluate the model with k-fold cross validation.¶
Also, ensure that you apply all the following suggestions:
• Implement the stratified 5-fold cross validation technique for both model building and validation
• Utilize effective standardization techniques and hyperparameter tuning
• Incorporate sklearn-pipelines to streamline the workflow
• Apply appropriate regularization techniques to address the bias-variance trade-off
• Create five folds in the data, and introduce a variable to identify the folds
• Develop Gradient Boost model and determine the variable importance scores, and identify the redundant variables
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
X = dataset.drop(columns=['charges']) #bcz we have to predict charges
y = dataset['charges']
y
0 563.84
1 570.62
2 600.00
3 604.54
4 637.26
...
2328 51194.56
2329 52590.83
2330 55135.40
2333 62592.87
2334 63770.43
Name: charges, Length: 1755, dtype: float64
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linear_pipeline = Pipeline([('scaler',StandardScaler()),('linear_regression',LinearRegression())])
linear_pipeline
Pipeline(steps=[('scaler', StandardScaler()),
('linear_regression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('linear_regression', LinearRegression())])StandardScaler()
LinearRegression()
ridge_pipeline = Pipeline([('scaler',StandardScaler()),('ridge_regression',Ridge())])
ridge_pipeline
Pipeline(steps=[('scaler', StandardScaler()), ('ridge_regression', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('ridge_regression', Ridge())])StandardScaler()
Ridge()
ridge_params = {'ridge_regression__alpha':[0.1,1.0,10.0,20.0,100.0]}
ridge_grid_search = GridSearchCV(ridge_pipeline,ridge_params,cv=kf)
linear_scores = []
ridge_scores = []
for train_index, test_index in kf.split(X):
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
linear_pipeline.fit(X_train, y_train)
y_pred_linear = linear_pipeline.predict(X_test)
linear_scores.append(mean_squared_error(y_test, y_pred_linear))
ridge_grid_search.fit(X_train, y_train)
y_pred_ridge = ridge_grid_search.predict(X_test)
ridge_scores.append(mean_squared_error(y_test, y_pred_ridge))
# Calculate and print the average RMSE
print(f'Linear Regression RMSE: {np.mean(linear_scores)**0.5:.4f}')
print(f'Ridge Regression RMSE: {np.mean(ridge_scores)**0.5:.4f}')
print(f'Best Ridge Alpha: {ridge_grid_search.best_params_["ridge_regression__alpha"]}')
Linear Regression RMSE: 4458.6490 Ridge Regression RMSE: 4458.5754 Best Ridge Alpha: 1.0
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
gbr_pipeline = Pipeline([
('scaler', StandardScaler()),
('GBR', GradientBoostingRegressor(random_state=42))
])
param_distributions = {
'GBR__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
'GBR__learning_rate': uniform(0.01, 0.2),
'GBR__n_estimators': [100, 200],
'GBR__max_depth': [3, 4, 5],
'GBR__criterion': ['friedman_mse', 'squared_error']
}
gbr_random_search = RandomizedSearchCV(
gbr_pipeline,
param_distributions,
cv=kf,
n_jobs=-1,
scoring='neg_mean_squared_error',
n_iter=10,
random_state=42
)
gbr_scores = []
for train_index, test_index in kf.split(X):
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
gbr_random_search.fit(X_train, y_train)
y_pred_gbr = gbr_random_search.predict(X_test)
gbr_scores.append(mean_squared_error(y_test, y_pred_gbr))
print(f'Gradient Boosting Regressor RMSE: {np.mean(gbr_scores)**0.5:.4f}')
print(f'Best GBR Parameters: {gbr_random_search.best_params_}')
Gradient Boosting Regressor RMSE: 3541.0587
Best GBR Parameters: {'GBR__criterion': 'squared_error', 'GBR__learning_rate': 0.04636499344142013, 'GBR__loss': 'squared_error', 'GBR__max_depth': 3, 'GBR__n_estimators': 200}
3. Case scenario:¶
Estimate the cost of hospitalization for Christopher, Ms. Jayna (Date of birth 12/28/1988; height 170 cm; and weight 85 kgs). She lives with her partner and two children in a tier-1 city, and her state’s State ID is R1011. She was found to be nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be estimated using tier-1 hospitals.
from datetime import datetime
dob = datetime.strptime('12/28/1988', '%m/%d/%Y')
today = datetime.today()
age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
df = pd.DataFrame({
'year': [1988],
'month': [12],
'date': [28],
'city_tier': [1],
'children': [2],
'HbA1c': [5.8],
'smoker': [1],
'Heart Issues': [0],
'Any Transplants': [0],
'NumberOfMajorSurgeries': [0],
'Cancer history': [1],
'Hospital tier': [1],
'BMI': [85 / (1.70 ** 2)],
'age': [age],
'Gender': [0],
'state_group': [1],
'State ID_R1011': [1]
})
df
| year | month | date | city_tier | children | HbA1c | smoker | Heart Issues | Any Transplants | NumberOfMajorSurgeries | Cancer history | Hospital tier | BMI | age | Gender | state_group | State ID_R1011 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1988 | 12 | 28 | 1 | 2 | 5.8 | 1 | 0 | 0 | 0 | 1 | 1 | 29.411765 | 35 | 0 | 1 | 1 |
# training data has specific columns and some columns might be missing in df, we need to handle that.
for col in X.columns:
if col not in df.columns:
df[col] = 0
# I Reorder columns to match the training data
df = df[X.columns]
# Ridge regression prediction
ridge_pred = ridge_grid_search.predict(df)[0]
# Gradient Boosting regression prediction
gbr_pred = gbr_random_search.predict(df)[0]
print(f'Predicted Hospitalization Cost using Ridge Regression: {ridge_pred:.2f} /-')
print(f'Predicted Hospitalization Cost using Gradient Boosting: {gbr_pred:.2f} /-')
Predicted Hospitalization Cost using Ridge Regression: 31814.93 /- Predicted Hospitalization Cost using Gradient Boosting: 24107.71 /-